# Import the regular-expression library (re) and the HTTP client (requests)
import re
import requests
# Listing URL used as the template for every page URL; the pagination loop
# below swaps the number in "list1.htm" for the page index.
old_url: str = "http://xagx.zuel.edu.cn/tzgg/list1.htm"
# Number of listing pages to walk (hard-coded).
total_page: int = 18

# Load the locally saved page source from the file named 'text' in the
# current working directory. The with-block guarantees the file handle is
# closed even if read() raises (the manual open/close pair leaked it then).
with open('text', 'r', encoding='utf-8') as f:
    html = f.read()

# Get the page title
# title=re.findall('<title>(.*?)</title>',html)[0]
# #title = re.search('<title>(.*?)</title>',html).group(1)
# print(title)

# Collect every double-quoted href attribute value from the loaded page.
# re.S lets '.' cross newlines, so a value split over several lines still matches.
links = re.findall(r'href="(.*?)"', html, flags=re.S)
# Emit one link per line; the guard keeps an empty result from printing a blank line.
if links:
    print('\n'.join(links))

# Get the college info: match the enclosing <ul> block first, then the link texts inside it
##text_fied = re.findall('<ul>(.*?)</ul>',html,re.S)[0]
##the_text = re.findall('">(.*?)</a>',text_fied,re.S)
##for every_text in the_text:
##  print(every_text)

# Paginate with re.sub: rewrite the page number inside old_url, download each
# listing page, and print every single-quoted title='...' attribute value on it.
# Both patterns are compiled once, outside the loop.
page_pattern = re.compile(r'list(.*?)\.htm')  # '.' escaped so only a literal ".htm" matches
title_pattern = re.compile(r"title='(.*?)'", re.S)  # re.S: '.' may cross newlines
for i in range(1, total_page + 1):
    new_link = page_pattern.sub(f'list{i}.htm', old_url)
    try:
        # Without a timeout, requests can wait forever on an unresponsive server.
        page = requests.get(new_link, timeout=10)
        # Reject 4xx/5xx responses instead of scraping an error page as a listing.
        page.raise_for_status()
    except requests.RequestException as exc:
        # A single failing page should not abort the remaining pages.
        print(f"第{i}页 request failed: {exc}")
        continue
    # Force UTF-8 decoding rather than relying on the encoding requests guesses.
    page.encoding = 'utf-8'
    html = page.text
    print(f"第{i}页")
    text_field = title_pattern.findall(html)
    for title in text_field:
        print(title)